Loading packages and subroutines

Packages

require(Seurat)
require(DT) # for datatable
require(SeuratWrappers) # for fastMNN
require(harmony) # for RunHarmony
require(reshape2) # for dcast

Import table

ImportTable <- function(file, header = T, row.names = 1, sep = ',', check.names = FALSE, ...){
  data <- read.csv(file = file, header = header, row.names = row.names, sep = sep, check.names = check.names, ...)
  return(data)
}

show_table

show_table <- function(df, rownames = T, filter="top", options = list(pageLength = 10, scrollX=T), ...){
  if (ncol(df) > 20) {
    df <- df[, 1:20]
    message('Due to the column dim is > 20, it only shows the ')
  }
  datatable(df, rownames = rownames, 
            filter=filter, options = options, ... )

}

Import data

Taxonomy name

tax_name <- ImportTable(file = '../00.ProcessData/2021-04-12-allTaxonomySplitLevel-v1.0.1.tsv', sep = '\t')
tax_name$new_name <- gsub(pattern = 's__', replacement = '', x = tax_name$Specie) %>% 
  gsub('_', ' ', .) # due to Feature names of CreateSeuratObject cannot have underscores, replace with blanket space.
show_table(tax_name)

Taxonomy matrix

tax_df <- ImportTable(file = '../00.ProcessData/2021-04-12-ReAbun-Fungi-filter1802-v1.0.0.tsv', sep = '\t')
# tax_df <- ImportTable(file = '../00.ProcessData/2021-04-12-RawData-Fungi-filter1802-v1.0.0.tsv', sep = '\t')
rownames(tax_df) <- tax_name[rownames(tax_df), "new_name"] # use the short species names
show_table(tax_df)
## Due to the column dim is > 20, it only shows the

Meta information

meta_df <- ImportTable(file = '../00.ProcessData/metaInfo-subgroup-v4.1.csv')
meta_df <- meta_df[colnames(tax_df),]
show_table(meta_df)

Remove the batch effect by Seurat

Create seurat object

CreateSeuratObject: Create a Seurat object link

s_obj <- CreateSeuratObject(counts = tax_df, meta.data = meta_df)

# split the object by cohorts
s_list <- SplitObject(s_obj, split.by = 'Cohort')

Remove batch effect

RPCA

Fast integration using reciprocal PCA (RPCA) link

Pre-process
for (i in 1:length(s_list)) {
    s_list[[i]] <- NormalizeData(s_list[[i]], verbose = FALSE)
    s_list[[i]] <- FindVariableFeatures(s_list[[i]], selection.method = "vst", nfeatures = 2000, verbose = FALSE)
}

s_anchors <- FindIntegrationAnchors(object.list = s_list, k.anchor = 20)

s_integrated <- IntegrateData(anchorset = s_anchors, k.weight = 10) %>%
  ScaleData(. , verbose = FALSE) %>% 
  RunPCA(. , npcs = 30, verbose = FALSE) %>%
  RunUMAP(. , reduction = "pca", dims = 1:30, verbose = FALSE)

prca_obj <- FindNeighbors(s_integrated, dims = 1:30) %>%
  FindClusters()
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 1802
## Number of edges: 72471
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7382
## Number of communities: 8
## Elapsed time: 0 seconds
prca_table <- dcast(as.data.frame(table(prca_obj@active.ident,prca_obj@meta.data[["Stage"]])), Var1 ~ Var2)
rownames(prca_table) <- prca_table$Var1; prca_table <- prca_table[,-1]
Table
show_table(prca_table)
Figure
DimPlot(prca_obj, group.by = c("Cohort", "Stage", 'ident'), ncol = 3)

fastMNN

fastMNN: Batch effects in single-cell RNA-sequencing data are corrected by matching mutual nearest neighbors link
Nature Biotechnology, 2018, DOI: 10.1038/nbt.4091

Pre-process
fMNN_obj <- NormalizeData(s_obj) %>% 
  FindVariableFeatures() %>%
  SplitObject(. ,  split.by = 'Cohort') %>%
  RunFastMNN()

fMNN_obj2 <- RunUMAP(fMNN_obj, reduction = 'mnn', dims = 1:30) %>%
  FindNeighbors(. , reduction = 'mnn', dims = 1:30) %>%
  FindClusters()
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 1802
## Number of edges: 76431
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.6830
## Number of communities: 6
## Elapsed time: 0 seconds
fMNN_table <- dcast(as.data.frame(table(fMNN_obj2@active.ident,fMNN_obj2@meta.data[["Stage"]])), Var1 ~ Var2)
rownames(fMNN_table) <- fMNN_table$Var1; fMNN_table <- fMNN_table[,-1]
Table
show_table(fMNN_table)
Figure
DimPlot(fMNN_obj2, group.by = c("Cohort", "Stage", 'ident'), ncol = 3)

Harmony

Fast, sensitive, and flexible integration of single cell data with Harmony. link

Nature Methods, 2019, DOI: 10.1038/s41592-019-0619-0

Pre-process
hmy_obj <- NormalizeData(s_obj) %>%
  FindVariableFeatures() %>% 
  ScaleData() %>% 
  RunPCA(. , verbose = FALSE)

hmy_obj2 <- RunHarmony(object = hmy_obj, group.by.vars = 'Cohort') %>%
  RunUMAP(., reduction = "harmony", dims = 1:30) %>%
  FindNeighbors(., reduction = "harmony", dims = 1:30) %>% 
  FindClusters()
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 1802
## Number of edges: 78214
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7433
## Number of communities: 9
## Elapsed time: 0 seconds
hmy_table <- dcast(as.data.frame(table(hmy_obj2@active.ident,hmy_obj2@meta.data[["Stage"]])), Var1 ~ Var2)
rownames(hmy_table) <- hmy_table$Var1; hmy_table <- hmy_table[,-1]
Table
show_table(hmy_table)
Figure
DimPlot(hmy_obj2, group.by = c("Cohort", "Stage", 'ident'), ncol = 3)